import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the iris measurements from a local CSV file.
# NOTE(review): the absolute Windows path is specific to one machine; adjust it before re-running elsewhere.
df = pd.read_csv("C:/Users/akbar/Documents/R AED/iris.csv")
# Bare expression so the notebook renders the frame.
df
| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| ... | ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 150 entries, 0 to 149 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Sepal.Length 150 non-null float64 1 Sepal.Width 150 non-null float64 2 Petal.Length 150 non-null float64 3 Petal.Width 150 non-null float64 4 Species 150 non-null object dtypes: float64(4), object(1) memory usage: 6.0+ KB
df.describe()
| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | |
|---|---|---|---|---|
| count | 150.000000 | 150.000000 | 150.000000 | 150.000000 |
| mean | 5.843333 | 3.057333 | 3.758000 | 1.199333 |
| std | 0.828066 | 0.435866 | 1.765298 | 0.762238 |
| min | 4.300000 | 2.000000 | 1.000000 | 0.100000 |
| 25% | 5.100000 | 2.800000 | 1.600000 | 0.300000 |
| 50% | 5.800000 | 3.000000 | 4.350000 | 1.300000 |
| 75% | 6.400000 | 3.300000 | 5.100000 | 1.800000 |
| max | 7.900000 | 4.400000 | 6.900000 | 2.500000 |
df.isnull().sum()
Sepal.Length 0 Sepal.Width 0 Petal.Length 0 Petal.Width 0 Species 0 dtype: int64
df.duplicated().sum()
1
df.shape
(150, 5)
df[df.duplicated()]
| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | |
|---|---|---|---|---|---|
| 142 | 5.8 | 2.7 | 5.1 | 1.9 | virginica |
df['Species'].value_counts()
setosa 50 versicolor 50 virginica 50 Name: Species, dtype: int64
# Bar chart of the number of rows per species, drawn on an explicitly created axes.
_, species_ax = plt.subplots(figsize=(8, 6))
species_ax.set_title('Species Count')
sns.countplot(x="Species", data=df, ax=species_ax)
<AxesSubplot:title={'center':'Species Count'}, xlabel='Species', ylabel='count'>
# Scatter-plot matrix of all numeric columns.
# pairplot is a figure-level function that creates its own figure, so a
# preceding plt.figure() call only leaves an empty, unused figure behind.
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x1f739f497f0>
<Figure size 576x432 with 0 Axes>
# Correlation heatmap of the numeric measurements only: the Species column
# holds strings, which DataFrame.corr() rejects in pandas >= 2.0.
sns.heatmap(df.select_dtypes(include="number").corr())
<AxesSubplot:>
df.columns.unique()
Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width',
'Species'],
dtype='object')
# One histogram with a KDE overlay for every column except the species label.
measurement_columns = [name for name in df.columns.unique() if name != "Species"]
for column_name in measurement_columns:
    sns.displot(df[column_name], kde=True)
# Box plot of each measurement column on its own subplot, each with its own palette.
colors = ['BrBG_r', 'BuGn_r', 'BuPu_r', 'GnBu_r']
f, axes = plt.subplots(4, figsize=(15, 15))
# Filter out the label column first so that palettes are paired with the
# measurement columns regardless of where "Species" sits in the frame.
measurement_cols = [col for col in df.columns.unique() if col != "Species"]
# zip pairs axis, column and palette directly; no manual index counter needed.
for ax, col, color in zip(axes, measurement_cols, colors):
    sns.boxplot(x=col, data=df, ax=ax, palette=color)
# Box plot of each measurement split by species, one subplot per measurement.
# Fixed colour per species so the three groups look the same in every subplot.
mp_pal = {"setosa": "y", "versicolor": "g", "virginica": "r"}
f, axes = plt.subplots(1, 4, figsize=(20, 6))
measurement_cols = [col for col in df.columns.unique() if col != "Species"]
# zip pairs each axis with its column; no manual index counter needed.
for ax, col in zip(axes, measurement_cols):
    sns.boxplot(x="Species", y=col, data=df, ax=ax, palette=mp_pal)
setosa_df = df[df['Species']=="setosa"]
setosa_df['Petal.Length'].quantile(0.97)
1.806
setosa_df['Petal.Length'].quantile(0.03)
1.147
# Setosa rows whose petal length lies outside the 3rd-97th percentile band of
# the setosa petal lengths.
upper_bound = setosa_df['Petal.Length'].quantile(0.97)
lower_bound = setosa_df['Petal.Length'].quantile(0.03)
is_setosa = df['Species'] == "setosa"
# `&` binds tighter than `|`, so the two bounds are combined first and only
# then restricted to setosa; otherwise the lower bound would be tested
# against every species.
outside_band = (df['Petal.Length'] > upper_bound) | (df['Petal.Length'] < lower_bound)
df[is_setosa & outside_band]
| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | |
|---|---|---|---|---|---|
| 13 | 4.3 | 3.0 | 1.1 | 0.1 | setosa |
| 22 | 4.6 | 3.6 | 1.0 | 0.2 | setosa |
| 24 | 4.8 | 3.4 | 1.9 | 0.2 | setosa |
| 44 | 5.1 | 3.8 | 1.9 | 0.4 | setosa |
# Preview the first 15 rows with the four flagged setosa rows removed.
# NOTE(review): the result of drop() is not assigned, so df itself still contains rows 13, 22, 24 and 44.
df.drop(index=[13,22,24,44]).head(15)
| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| 5 | 5.4 | 3.9 | 1.7 | 0.4 | setosa |
| 6 | 4.6 | 3.4 | 1.4 | 0.3 | setosa |
| 7 | 5.0 | 3.4 | 1.5 | 0.2 | setosa |
| 8 | 4.4 | 2.9 | 1.4 | 0.2 | setosa |
| 9 | 4.9 | 3.1 | 1.5 | 0.1 | setosa |
| 10 | 5.4 | 3.7 | 1.5 | 0.2 | setosa |
| 11 | 4.8 | 3.4 | 1.6 | 0.2 | setosa |
| 12 | 4.8 | 3.0 | 1.4 | 0.1 | setosa |
| 14 | 5.8 | 4.0 | 1.2 | 0.2 | setosa |
| 15 | 5.7 | 4.4 | 1.5 | 0.4 | setosa |
!pip3 install pandas-profiling
Requirement already satisfied: pandas-profiling in c:\users\akbar\anaconda3\lib\site-packages (3.2.0)
WARNING: Retrying (Retry(total=4, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x00000215AC02C7C0>: Failed to establish a new connection: [WinError 10051] A socket operation was attempted to an unreachable network')': /simple/markupsafe/
WARNING: Retrying (Retry(total=3, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x00000215AC02CA60>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/markupsafe/
WARNING: Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x00000215AC02CBE0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/markupsafe/
WARNING: Retrying (Retry(total=1, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x00000215AC02CE20>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/markupsafe/
WARNING: Retrying (Retry(total=0, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x00000215AC05AEE0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/markupsafe/
WARNING: Retrying (Retry(total=4, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x00000215AC052040>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/pandas-profiling/
WARNING: Retrying (Retry(total=3, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x00000215AC015670>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/pandas-profiling/
WARNING: Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x00000215AC04B070>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/pandas-profiling/
WARNING: Retrying (Retry(total=1, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x00000215AC04B7C0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/pandas-profiling/
WARNING: Retrying (Retry(total=0, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x00000215AC05A640>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/pandas-profiling/
ERROR: Could not find a version that satisfies the requirement markupsafe~=2.1.1 (from pandas-profiling) (from versions: none)
ERROR: No matching distribution found for markupsafe~=2.1.1
Requirement already satisfied: multimethod>=1.4 in c:\users\akbar\anaconda3\lib\site-packages (from pandas-profiling) (1.8) Requirement already satisfied: scipy>=1.4.1 in c:\users\akbar\anaconda3\lib\site-packages (from pandas-profiling) (1.7.1) Requirement already satisfied: pandas!=1.0.0,!=1.0.1,!=1.0.2,!=1.1.0,>=0.25.3 in c:\users\akbar\anaconda3\lib\site-packages (from pandas-profiling) (1.3.4) Requirement already satisfied: missingno>=0.4.2 in c:\users\akbar\anaconda3\lib\site-packages (from pandas-profiling) (0.5.1) Requirement already satisfied: requests>=2.24.0 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from pandas-profiling) (2.25.1) Requirement already satisfied: visions[type_image_path]==0.7.4 in c:\users\akbar\anaconda3\lib\site-packages (from pandas-profiling) (0.7.4) Requirement already satisfied: pydantic>=1.8.1 in c:\users\akbar\anaconda3\lib\site-packages (from pandas-profiling) (1.9.1) Requirement already satisfied: joblib~=1.1.0 in c:\users\akbar\anaconda3\lib\site-packages (from pandas-profiling) (1.1.0) Requirement already satisfied: jinja2>=2.11.1 in c:\users\akbar\anaconda3\lib\site-packages (from pandas-profiling) (2.11.3)
!pip3 install MarkupSafe==2.1.1
WARNING: Retrying (Retry(total=4, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x0000023FDAE15850>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/markupsafe/
WARNING: Retrying (Retry(total=3, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x0000023FDAE1E1F0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/markupsafe/
WARNING: Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x0000023FDAE1E400>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/markupsafe/
WARNING: Retrying (Retry(total=1, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x0000023FDAE1E5B0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/markupsafe/
WARNING: Retrying (Retry(total=0, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x0000023FDAE1E760>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/markupsafe/
ERROR: Could not find a version that satisfies the requirement MarkupSafe==2.1.1 (from versions: none)
ERROR: No matching distribution found for MarkupSafe==2.1.1
pip install --user --upgrade aws-sam-cli
Requirement already satisfied: aws-sam-cli in c:\users\akbar\appdata\roaming\python\python39\site-packages (1.53.0)Note: you may need to restart the kernel to use updated packages.
WARNING: Retrying (Retry(total=4, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x000002065D51C2E0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/aws-sam-cli/
WARNING: Retrying (Retry(total=3, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x000002065D51C4F0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/aws-sam-cli/
WARNING: Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x000002065D51C6A0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/aws-sam-cli/
WARNING: Retrying (Retry(total=1, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x000002065D51C850>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/aws-sam-cli/
WARNING: Retrying (Retry(total=0, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x000002065D51CA00>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/aws-sam-cli/
Requirement already satisfied: click~=7.1 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from aws-sam-cli) (7.1.2) Requirement already satisfied: dateparser~=1.0 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from aws-sam-cli) (1.1.1) Requirement already satisfied: regex==2021.9.30 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from aws-sam-cli) (2021.9.30) Requirement already satisfied: tzlocal==3.0 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from aws-sam-cli) (3.0) Requirement already satisfied: typing-extensions==3.10.0.0 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from aws-sam-cli) (3.10.0.0) Requirement already satisfied: chevron~=0.12 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from aws-sam-cli) (0.14.0) Requirement already satisfied: watchdog==2.1.2 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from aws-sam-cli) (2.1.2) Requirement already satisfied: serverlessrepo==0.1.10 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from aws-sam-cli) (0.1.10) Requirement already satisfied: Flask~=1.1.4 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from aws-sam-cli) (1.1.4) Requirement already satisfied: MarkupSafe==2.0.1 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from aws-sam-cli) (2.0.1) Requirement already satisfied: jmespath~=0.10.0 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from aws-sam-cli) (0.10.0) Requirement already satisfied: cookiecutter~=2.1.1 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from aws-sam-cli) (2.1.1) Requirement already satisfied: requests==2.25.1 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from aws-sam-cli) (2.25.1) Requirement already satisfied: aws-sam-translator==1.46.0 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from aws-sam-cli) (1.46.0) Requirement already satisfied: 
PyYAML~=5.3 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from aws-sam-cli) (5.4.1) Requirement already satisfied: boto3==1.*,>=1.19.5 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from aws-sam-cli) (1.24.33) Requirement already satisfied: docker~=4.2.0 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from aws-sam-cli) (4.2.2) Requirement already satisfied: tomlkit==0.7.2 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from aws-sam-cli) (0.7.2) Requirement already satisfied: aws-lambda-builders==1.18.0 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from aws-sam-cli) (1.18.0) Requirement already satisfied: setuptools in c:\users\akbar\anaconda3\lib\site-packages (from aws-lambda-builders==1.18.0->aws-sam-cli) (58.0.4) Requirement already satisfied: wheel in c:\users\akbar\anaconda3\lib\site-packages (from aws-lambda-builders==1.18.0->aws-sam-cli) (0.37.0) Requirement already satisfied: jsonschema~=3.2 in c:\users\akbar\anaconda3\lib\site-packages (from aws-sam-translator==1.46.0->aws-sam-cli) (3.2.0) Requirement already satisfied: s3transfer<0.7.0,>=0.6.0 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from boto3==1.*,>=1.19.5->aws-sam-cli) (0.6.0) Requirement already satisfied: botocore<1.28.0,>=1.27.33 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from boto3==1.*,>=1.19.5->aws-sam-cli) (1.27.33) Requirement already satisfied: chardet<5,>=3.0.2 in c:\users\akbar\anaconda3\lib\site-packages (from requests==2.25.1->aws-sam-cli) (4.0.0) Requirement already satisfied: idna<3,>=2.5 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from requests==2.25.1->aws-sam-cli) (2.10) Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\akbar\anaconda3\lib\site-packages (from requests==2.25.1->aws-sam-cli) (1.26.7) Requirement already satisfied: certifi>=2017.4.17 in c:\users\akbar\anaconda3\lib\site-packages (from 
requests==2.25.1->aws-sam-cli) (2021.10.8) Requirement already satisfied: six~=1.11 in c:\users\akbar\anaconda3\lib\site-packages (from serverlessrepo==0.1.10->aws-sam-cli) (1.16.0) Requirement already satisfied: tzdata in c:\users\akbar\appdata\roaming\python\python39\site-packages (from tzlocal==3.0->aws-sam-cli) (2022.1) Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in c:\users\akbar\anaconda3\lib\site-packages (from botocore<1.28.0,>=1.27.33->boto3==1.*,>=1.19.5->aws-sam-cli) (2.8.2) Requirement already satisfied: python-slugify>=4.0.0 in c:\users\akbar\anaconda3\lib\site-packages (from cookiecutter~=2.1.1->aws-sam-cli) (5.0.2) Requirement already satisfied: binaryornot>=0.4.4 in c:\users\akbar\anaconda3\lib\site-packages (from cookiecutter~=2.1.1->aws-sam-cli) (0.4.4) Requirement already satisfied: jinja2-time>=0.2.0 in c:\users\akbar\anaconda3\lib\site-packages (from cookiecutter~=2.1.1->aws-sam-cli) (0.2.0) Requirement already satisfied: Jinja2<4.0.0,>=2.7 in c:\users\akbar\anaconda3\lib\site-packages (from cookiecutter~=2.1.1->aws-sam-cli) (2.11.3) Requirement already satisfied: pytz in c:\users\akbar\anaconda3\lib\site-packages (from dateparser~=1.0->aws-sam-cli) (2021.3) Requirement already satisfied: pypiwin32==223 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from docker~=4.2.0->aws-sam-cli) (223) Requirement already satisfied: websocket-client>=0.32.0 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from docker~=4.2.0->aws-sam-cli) (1.3.3) Requirement already satisfied: pywin32>=223 in c:\users\akbar\anaconda3\lib\site-packages (from pypiwin32==223->docker~=4.2.0->aws-sam-cli) (228) Requirement already satisfied: Werkzeug<2.0,>=0.15 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from Flask~=1.1.4->aws-sam-cli) (1.0.1) Requirement already satisfied: itsdangerous<2.0,>=0.24 in c:\users\akbar\appdata\roaming\python\python39\site-packages (from Flask~=1.1.4->aws-sam-cli) (1.1.0) Requirement 
already satisfied: arrow in c:\users\akbar\anaconda3\lib\site-packages (from jinja2-time>=0.2.0->cookiecutter~=2.1.1->aws-sam-cli) (0.13.1) Requirement already satisfied: pyrsistent>=0.14.0 in c:\users\akbar\anaconda3\lib\site-packages (from jsonschema~=3.2->aws-sam-translator==1.46.0->aws-sam-cli) (0.18.0) Requirement already satisfied: attrs>=17.4.0 in c:\users\akbar\anaconda3\lib\site-packages (from jsonschema~=3.2->aws-sam-translator==1.46.0->aws-sam-cli) (21.2.0) Requirement already satisfied: text-unidecode>=1.3 in c:\users\akbar\anaconda3\lib\site-packages (from python-slugify>=4.0.0->cookiecutter~=2.1.1->aws-sam-cli) (1.3)
from pandas_profiling import ProfileReport
# Load the second dataset (train.csv) into its own frame.
# NOTE(review): the absolute Windows path is specific to one machine; adjust it before re-running elsewhere.
dff = pd.read_csv('C:/Users/akbar/Documents/Coding Course/StackUp EDA Python/train.csv')
# Show the first five rows.
dff.head(5)
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Build a pandas-profiling report for the whole frame and render it inline in the notebook.
prof = ProfileReport(dff)
prof.to_notebook_iframe()
dff.dtypes
PassengerId int64 Survived int64 Pclass int64 Name object Sex object Age float64 SibSp int64 Parch int64 Ticket object Fare float64 Cabin object Embarked object dtype: object
dff.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 891 non-null int64 1 Survived 891 non-null int64 2 Pclass 891 non-null int64 3 Name 891 non-null object 4 Sex 891 non-null object 5 Age 714 non-null float64 6 SibSp 891 non-null int64 7 Parch 891 non-null int64 8 Ticket 891 non-null object 9 Fare 891 non-null float64 10 Cabin 204 non-null object 11 Embarked 889 non-null object dtypes: float64(2), int64(5), object(5) memory usage: 83.7+ KB
dff.shape
(891, 12)
dff.describe()
| PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
|---|---|---|---|---|---|---|---|
| count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
| std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
| min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
| 50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
| max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
dff['Survived'].value_counts(normalize = True)
0 0.616162 1 0.383838 Name: Survived, dtype: float64
sns.pairplot(dff,hue='Survived')
<seaborn.axisgrid.PairGrid at 0x1f73d89f8b0>
dff.isna().sum()
PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 177 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 687 Embarked 2 dtype: int64
# Heatmap of missing cells: after transposing, each heatmap row is one column of dff.
plt.figure(figsize=(12, 8))
missing_by_column = dff.isna().T
sns.heatmap(missing_by_column, cmap="YlGnBu", cbar_kws={'label': 'Missing data'})
<AxesSubplot:>
dff['Ticket'].unique()
array(['A/5 21171', 'PC 17599', 'STON/O2. 3101282', '113803', '373450',
'330877', '17463', '349909', '347742', '237736', 'PP 9549',
'113783', 'A/5. 2151', '347082', '350406', '248706', '382652',
'244373', '345763', '2649', '239865', '248698', '330923', '113788',
'347077', '2631', '19950', '330959', '349216', 'PC 17601',
'PC 17569', '335677', 'C.A. 24579', 'PC 17604', '113789', '2677',
'A./5. 2152', '345764', '2651', '7546', '11668', '349253',
'SC/Paris 2123', '330958', 'S.C./A.4. 23567', '370371', '14311',
'2662', '349237', '3101295', 'A/4. 39886', 'PC 17572', '2926',
'113509', '19947', 'C.A. 31026', '2697', 'C.A. 34651', 'CA 2144',
'2669', '113572', '36973', '347088', 'PC 17605', '2661',
'C.A. 29395', 'S.P. 3464', '3101281', '315151', 'C.A. 33111',
'S.O.C. 14879', '2680', '1601', '348123', '349208', '374746',
'248738', '364516', '345767', '345779', '330932', '113059',
'SO/C 14885', '3101278', 'W./C. 6608', 'SOTON/OQ 392086', '343275',
'343276', '347466', 'W.E.P. 5734', 'C.A. 2315', '364500', '374910',
'PC 17754', 'PC 17759', '231919', '244367', '349245', '349215',
'35281', '7540', '3101276', '349207', '343120', '312991', '349249',
'371110', '110465', '2665', '324669', '4136', '2627',
'STON/O 2. 3101294', '370369', 'PC 17558', 'A4. 54510', '27267',
'370372', 'C 17369', '2668', '347061', '349241',
'SOTON/O.Q. 3101307', 'A/5. 3337', '228414', 'C.A. 29178',
'SC/PARIS 2133', '11752', '7534', 'PC 17593', '2678', '347081',
'STON/O2. 3101279', '365222', '231945', 'C.A. 33112', '350043',
'230080', '244310', 'S.O.P. 1166', '113776', 'A.5. 11206',
'A/5. 851', 'Fa 265302', 'PC 17597', '35851', 'SOTON/OQ 392090',
'315037', 'CA. 2343', '371362', 'C.A. 33595', '347068', '315093',
'363291', '113505', 'PC 17318', '111240', 'STON/O 2. 3101280',
'17764', '350404', '4133', 'PC 17595', '250653', 'LINE',
'SC/PARIS 2131', '230136', '315153', '113767', '370365', '111428',
'364849', '349247', '234604', '28424', '350046', 'PC 17610',
'368703', '4579', '370370', '248747', '345770', '3101264', '2628',
'A/5 3540', '347054', '2699', '367231', '112277',
'SOTON/O.Q. 3101311', 'F.C.C. 13528', 'A/5 21174', '250646',
'367229', '35273', 'STON/O2. 3101283', '243847', '11813',
'W/C 14208', 'SOTON/OQ 392089', '220367', '21440', '349234',
'19943', 'PP 4348', 'SW/PP 751', 'A/5 21173', '236171', '347067',
'237442', 'C.A. 29566', 'W./C. 6609', '26707', 'C.A. 31921',
'28665', 'SCO/W 1585', '367230', 'W./C. 14263',
'STON/O 2. 3101275', '2694', '19928', '347071', '250649', '11751',
'244252', '362316', '113514', 'A/5. 3336', '370129', '2650',
'PC 17585', '110152', 'PC 17755', '230433', '384461', '110413',
'112059', '382649', 'C.A. 17248', '347083', 'PC 17582', 'PC 17760',
'113798', '250644', 'PC 17596', '370375', '13502', '347073',
'239853', 'C.A. 2673', '336439', '347464', '345778', 'A/5. 10482',
'113056', '349239', '345774', '349206', '237798', '370373',
'19877', '11967', 'SC/Paris 2163', '349236', '349233', 'PC 17612',
'2693', '113781', '19988', '9234', '367226', '226593', 'A/5 2466',
'17421', 'PC 17758', 'P/PP 3381', 'PC 17485', '11767', 'PC 17608',
'250651', '349243', 'F.C.C. 13529', '347470', '29011', '36928',
'16966', 'A/5 21172', '349219', '234818', '345364', '28551',
'111361', '113043', 'PC 17611', '349225', '7598', '113784',
'248740', '244361', '229236', '248733', '31418', '386525',
'C.A. 37671', '315088', '7267', '113510', '2695', '2647', '345783',
'237671', '330931', '330980', 'SC/PARIS 2167', '2691',
'SOTON/O.Q. 3101310', 'C 7076', '110813', '2626', '14313',
'PC 17477', '11765', '3101267', '323951', 'C 7077', '113503',
'2648', '347069', 'PC 17757', '2653', 'STON/O 2. 3101293',
'349227', '27849', '367655', 'SC 1748', '113760', '350034',
'3101277', '350052', '350407', '28403', '244278', '240929',
'STON/O 2. 3101289', '341826', '4137', '315096', '28664', '347064',
'29106', '312992', '349222', '394140', 'STON/O 2. 3101269',
'343095', '28220', '250652', '28228', '345773', '349254',
'A/5. 13032', '315082', '347080', 'A/4. 34244', '2003', '250655',
'364851', 'SOTON/O.Q. 392078', '110564', '376564', 'SC/AH 3085',
'STON/O 2. 3101274', '13507', 'C.A. 18723', '345769', '347076',
'230434', '65306', '33638', '113794', '2666', '113786', '65303',
'113051', '17453', 'A/5 2817', '349240', '13509', '17464',
'F.C.C. 13531', '371060', '19952', '364506', '111320', '234360',
'A/S 2816', 'SOTON/O.Q. 3101306', '113792', '36209', '323592',
'315089', 'SC/AH Basle 541', '7553', '31027', '3460', '350060',
'3101298', '239854', 'A/5 3594', '4134', '11771', 'A.5. 18509',
'65304', 'SOTON/OQ 3101317', '113787', 'PC 17609', 'A/4 45380',
'36947', 'C.A. 6212', '350035', '315086', '364846', '330909',
'4135', '26360', '111427', 'C 4001', '382651', 'SOTON/OQ 3101316',
'PC 17473', 'PC 17603', '349209', '36967', 'C.A. 34260', '226875',
'349242', '12749', '349252', '2624', '2700', '367232',
'W./C. 14258', 'PC 17483', '3101296', '29104', '2641', '2690',
'315084', '113050', 'PC 17761', '364498', '13568', 'WE/P 5735',
'2908', '693', 'SC/PARIS 2146', '244358', '330979', '2620',
'347085', '113807', '11755', '345572', '372622', '349251',
'218629', 'SOTON/OQ 392082', 'SOTON/O.Q. 392087', 'A/4 48871',
'349205', '2686', '350417', 'S.W./PP 752', '11769', 'PC 17474',
'14312', 'A/4. 20589', '358585', '243880', '2689',
'STON/O 2. 3101286', '237789', '13049', '3411', '237565', '13567',
'14973', 'A./5. 3235', 'STON/O 2. 3101273', 'A/5 3902', '364848',
'SC/AH 29037', '248727', '2664', '349214', '113796', '364511',
'111426', '349910', '349246', '113804', 'SOTON/O.Q. 3101305',
'370377', '364512', '220845', '31028', '2659', '11753', '350029',
'54636', '36963', '219533', '349224', '334912', '27042', '347743',
'13214', '112052', '237668', 'STON/O 2. 3101292', '350050',
'349231', '13213', 'S.O./P.P. 751', 'CA. 2314', '349221', '8475',
'330919', '365226', '349223', '29751', '2623', '5727', '349210',
'STON/O 2. 3101285', '234686', '312993', 'A/5 3536', '19996',
'29750', 'F.C. 12750', 'C.A. 24580', '244270', '239856', '349912',
'342826', '4138', '330935', '6563', '349228', '350036', '24160',
'17474', '349256', '2672', '113800', '248731', '363592', '35852',
'348121', 'PC 17475', '36864', '350025', '223596', 'PC 17476',
'PC 17482', '113028', '7545', '250647', '348124', '34218', '36568',
'347062', '350048', '12233', '250643', '113806', '315094', '36866',
'236853', 'STON/O2. 3101271', '239855', '28425', '233639',
'349201', '349218', '16988', '376566', 'STON/O 2. 3101288',
'250648', '113773', '335097', '29103', '392096', '345780',
'349204', '350042', '29108', '363294', 'SOTON/O2 3101272', '2663',
'347074', '112379', '364850', '8471', '345781', '350047',
'S.O./P.P. 3', '2674', '29105', '347078', '383121', '36865',
'2687', '113501', 'W./C. 6607', 'SOTON/O.Q. 3101312', '374887',
'3101265', '12460', 'PC 17600', '349203', '28213', '17465',
'349244', '2685', '2625', '347089', '347063', '112050', '347087',
'248723', '3474', '28206', '364499', '112058', 'STON/O2. 3101290',
'S.C./PARIS 2079', 'C 7075', '315098', '19972', '368323', '367228',
'2671', '347468', '2223', 'PC 17756', '315097', '392092', '11774',
'SOTON/O2 3101287', '2683', '315090', 'C.A. 5547', '349213',
'347060', 'PC 17592', '392091', '113055', '2629', '350026',
'28134', '17466', '233866', '236852', 'SC/PARIS 2149', 'PC 17590',
'345777', '349248', '695', '345765', '2667', '349212', '349217',
'349257', '7552', 'C.A./SOTON 34068', 'SOTON/OQ 392076', '211536',
'112053', '111369', '370376'], dtype=object)
dff['Sex'].unique()
array(['male', 'female'], dtype=object)
# Remove the PassengerId, Cabin and Ticket columns from dff in place.
# NOTE(review): because of inplace=True, re-running this cell raises KeyError once the columns are gone.
dff.drop(columns=['PassengerId','Cabin','Ticket'], inplace=True)
# Number of rows for each value of Survived.
plt.figure(figsize=(12,8))
sns.set_style("whitegrid")
sns.countplot(x='Survived', data=dff)
<AxesSubplot:xlabel='Survived', ylabel='count'>
# Number of rows for each value of Survived, split by Sex.
plt.figure(figsize=(12,8))
sns.set_style("whitegrid")
sns.countplot(x='Survived',hue='Sex',data=dff)
<AxesSubplot:xlabel='Survived', ylabel='count'>
# Number of rows for each value of Survived, split by Pclass.
plt.figure(figsize=(12,8))
sns.set_style("whitegrid")
sns.countplot(x='Survived',hue='Pclass',data=dff)
<AxesSubplot:xlabel='Survived', ylabel='count'>
# Histogram of the known ages (missing values dropped), 20 bins, no KDE curve.
sns.displot(dff['Age'].dropna(),kde=False,bins=20,color='darkgreen')
<seaborn.axisgrid.FacetGrid at 0x1f73fb06070>
# Number of rows for each value of Survived, split by SibSp.
plt.figure(figsize=(12,8))
sns.set_style("whitegrid")
sns.countplot(x='Survived',hue='SibSp',data=dff)
<AxesSubplot:xlabel='Survived', ylabel='count'>
# Histogram of Fare using the pandas plotting helper, 30 bins.
dff['Fare'].hist(color='darkgreen',
bins=30,
figsize=(12,8))
<AxesSubplot:>
# Histogram of Fare in 20 bins, coloured by the Survived value.
plt.figure(figsize=(12,8))
sns.histplot(x='Fare',hue='Survived',data=dff,bins=20)
<AxesSubplot:xlabel='Fare', ylabel='Count'>
# Box plot of Age per Pclass; the hue is the (Sex, Survived) pair of each row,
# built as a tuple so every combination gets its own box.
plt.figure(figsize=(16,12))
sns.boxplot(x='Pclass',y='Age',
hue=dff[['Sex','Survived']].apply(tuple, axis=1),
data=dff,palette='PuOr_r')
<AxesSubplot:xlabel='Pclass', ylabel='Age'>
# Replacement ages keyed by (Pclass, Survived, Sex).
# NOTE(review): the values look like per-group typical ages read off the
# Age-by-Pclass box plot — confirm against the data (e.g. 3 for surviving
# second-class males).
# NOTE(review): conditioning on Survived uses the target to fill a feature;
# confirm this is intended before Age is used in a predictive model.
_IMPUTED_AGE_BY_GROUP = {
    (1, 0, 'male'): 46,
    (1, 1, 'male'): 35,
    (1, 0, 'female'): 25,
    (1, 1, 'female'): 35,
    (2, 0, 'male'): 31,
    (2, 1, 'male'): 3,
    (2, 0, 'female'): 33,
    (2, 1, 'female'): 28,
    (3, 0, 'male'): 25,
    (3, 1, 'male'): 25,
    (3, 0, 'female'): 22,
    (3, 1, 'female'): 19,
}


def impute_age(cols):
    """Return the age for one row, filling a missing value by group.

    Args:
        cols: Sequence or Series holding, in this order, Age, Pclass,
            Survived and Sex. Any further items are ignored.

    Returns:
        The original age when it is present. Otherwise the replacement age
        for the row's (Pclass, Survived, Sex) group, or the missing age
        unchanged when the group is not in the table.
    """
    # Unpack by iteration instead of cols[0]..cols[3]: integer keys on a
    # label-indexed Series are deprecated as positional access in pandas 2.x.
    age, pclass, survived, gender, *_ = cols
    if not pd.isnull(age):
        return age
    return _IMPUTED_AGE_BY_GROUP.get((pclass, survived, gender), age)
# Fill missing ages row by row. The column order passed here must match the
# order impute_age reads its input in: Age, Pclass, Survived, Sex.
dff['Age'] = dff[['Age','Pclass','Survived','Sex']].apply(impute_age,axis=1)
# Show the first 15 rows after imputation.
dff.head(15)
| Survived | Pclass | Name | Sex | Age | SibSp | Parch | Fare | Embarked | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | 7.2500 | S |
| 1 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | 71.2833 | C |
| 2 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | 7.9250 | S |
| 3 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 53.1000 | S |
| 4 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 8.0500 | S |
| 5 | 0 | 3 | Moran, Mr. James | male | 25.0 | 0 | 0 | 8.4583 | Q |
| 6 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54.0 | 0 | 0 | 51.8625 | S |
| 7 | 0 | 3 | Palsson, Master. Gosta Leonard | male | 2.0 | 3 | 1 | 21.0750 | S |
| 8 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27.0 | 0 | 2 | 11.1333 | S |
| 9 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14.0 | 1 | 0 | 30.0708 | C |
| 10 | 1 | 3 | Sandstrom, Miss. Marguerite Rut | female | 4.0 | 1 | 1 | 16.7000 | S |
| 11 | 1 | 1 | Bonnell, Miss. Elizabeth | female | 58.0 | 0 | 0 | 26.5500 | S |
| 12 | 0 | 3 | Saundercock, Mr. William Henry | male | 20.0 | 0 | 0 | 8.0500 | S |
| 13 | 0 | 3 | Andersson, Mr. Anders Johan | male | 39.0 | 1 | 5 | 31.2750 | S |
| 14 | 0 | 3 | Vestrom, Miss. Hulda Amanda Adolfina | female | 14.0 | 0 | 0 | 7.8542 | S |
dff.isnull().sum()
Survived 0 Pclass 0 Name 0 Sex 0 Age 0 SibSp 0 Parch 0 Fare 0 Embarked 2 dtype: int64
def name_title(name):
    """Return the honorific contained in a passenger name, or NaN.

    Names are expected in the form ``"Surname, Title. Given names"``. The
    title is taken as the last word between the first comma and the next
    period and is compared as a whole word: a substring test such as
    ``'Mr' in name`` would also match ``"Mrs"``.

    Args:
        name: The full name string. A name without a comma is scanned from
            its start; the title must still be terminated by a period.

    Returns:
        ``'Mr'``, ``'Mrs'``, ``'Miss'`` or ``'Rev'`` when the name carries
        one of these titles, otherwise ``np.nan`` (also for non-string
        input such as a missing name).
    """
    if not isinstance(name, str):
        return np.nan

    # Skip the surname when there is one.
    _, comma, remainder = name.partition(',')
    if not comma:
        remainder = name

    # Keep only the last word before the period, so that a multi-word prefix
    # such as "the Countess" reduces to the title itself.
    words = remainder.partition('.')[0].split()
    title = words[-1] if words else ''

    if title in ('Mr', 'Mrs', 'Miss', 'Rev'):
        return title
    return np.nan
# Derive a 'title' column from every name, then display the rows for which
# name_title returned NaN, i.e. names carrying a title it does not recognise.
dff['title'] = dff['Name'].apply(name_title)
untitled_rows = dff['title'].isna()
dff[untitled_rows]
| Survived | Pclass | Name | Sex | Age | SibSp | Parch | Fare | Embarked | title | |
|---|---|---|---|---|---|---|---|---|---|---|
| 7 | 0 | 3 | Palsson, Master. Gosta Leonard | male | 2.00 | 3 | 1 | 21.0750 | S | NaN |
| 16 | 0 | 3 | Rice, Master. Eugene | male | 2.00 | 4 | 1 | 29.1250 | Q | NaN |
| 30 | 0 | 1 | Uruchurtu, Don. Manuel E | male | 40.00 | 0 | 0 | 27.7208 | C | NaN |
| 50 | 0 | 3 | Panula, Master. Juha Niilo | male | 7.00 | 4 | 1 | 39.6875 | S | NaN |
| 59 | 0 | 3 | Goodwin, Master. William Frederick | male | 11.00 | 5 | 2 | 46.9000 | S | NaN |
| 63 | 0 | 3 | Skoog, Master. Harald | male | 4.00 | 3 | 2 | 27.9000 | S | NaN |
| 65 | 1 | 3 | Moubarek, Master. Gerios | male | 25.00 | 1 | 1 | 15.2458 | C | NaN |
| 78 | 1 | 2 | Caldwell, Master. Alden Gates | male | 0.83 | 0 | 2 | 29.0000 | S | NaN |
| 125 | 1 | 3 | Nicola-Yarred, Master. Elias | male | 12.00 | 1 | 0 | 11.2417 | C | NaN |
| 159 | 0 | 3 | Sage, Master. Thomas Henry | male | 25.00 | 8 | 2 | 69.5500 | S | NaN |
| 164 | 0 | 3 | Panula, Master. Eino Viljami | male | 1.00 | 4 | 1 | 39.6875 | S | NaN |
| 165 | 1 | 3 | Goldsmith, Master. Frank John William "Frankie" | male | 9.00 | 0 | 2 | 20.5250 | S | NaN |
| 171 | 0 | 3 | Rice, Master. Arthur | male | 4.00 | 4 | 1 | 29.1250 | Q | NaN |
| 176 | 0 | 3 | Lefebre, Master. Henry Forbes | male | 25.00 | 3 | 1 | 25.4667 | S | NaN |
| 182 | 0 | 3 | Asplund, Master. Clarence Gustaf Hugo | male | 9.00 | 4 | 2 | 31.3875 | S | NaN |
| 183 | 1 | 2 | Becker, Master. Richard F | male | 1.00 | 2 | 1 | 39.0000 | S | NaN |
| 193 | 1 | 2 | Navratil, Master. Michel M | male | 3.00 | 1 | 1 | 26.0000 | S | NaN |
| 245 | 0 | 1 | Minahan, Dr. William Edward | male | 44.00 | 2 | 0 | 90.0000 | Q | NaN |
| 261 | 1 | 3 | Asplund, Master. Edvin Rojj Felix | male | 3.00 | 4 | 2 | 31.3875 | S | NaN |
| 278 | 0 | 3 | Rice, Master. Eric | male | 7.00 | 4 | 1 | 29.1250 | Q | NaN |
| 305 | 1 | 1 | Allison, Master. Hudson Trevor | male | 0.92 | 1 | 2 | 151.5500 | S | NaN |
| 317 | 0 | 2 | Moraweck, Dr. Ernest | male | 54.00 | 0 | 0 | 14.0000 | S | NaN |
| 340 | 1 | 2 | Navratil, Master. Edmond Roger | male | 2.00 | 1 | 1 | 26.0000 | S | NaN |
| 348 | 1 | 3 | Coutts, Master. William Loch "William" | male | 3.00 | 1 | 1 | 15.9000 | S | NaN |
| 369 | 1 | 1 | Aubart, Mme. Leontine Pauline | female | 24.00 | 0 | 0 | 69.3000 | C | NaN |
| 386 | 0 | 3 | Goodwin, Master. Sidney Leonard | male | 1.00 | 5 | 2 | 46.9000 | S | NaN |
| 398 | 0 | 2 | Pain, Dr. Alfred | male | 23.00 | 0 | 0 | 10.5000 | S | NaN |
| 407 | 1 | 2 | Richards, Master. William Rowe | male | 3.00 | 1 | 1 | 18.7500 | S | NaN |
| 443 | 1 | 2 | Reynaldo, Ms. Encarnacion | female | 28.00 | 0 | 0 | 13.0000 | S | NaN |
| 445 | 1 | 1 | Dodge, Master. Washington | male | 4.00 | 0 | 2 | 81.8583 | S | NaN |
| 449 | 1 | 1 | Peuchen, Major. Arthur Godfrey | male | 52.00 | 0 | 0 | 30.5000 | S | NaN |
| 480 | 0 | 3 | Goodwin, Master. Harold Victor | male | 9.00 | 5 | 2 | 46.9000 | S | NaN |
| 489 | 1 | 3 | Coutts, Master. Eden Leslie "Neville" | male | 9.00 | 1 | 1 | 15.9000 | S | NaN |
| 536 | 0 | 1 | Butt, Major. Archibald Willingham | male | 45.00 | 0 | 0 | 26.5500 | S | NaN |
| 549 | 1 | 2 | Davies, Master. John Morgan Jr | male | 8.00 | 1 | 1 | 36.7500 | S | NaN |
| 632 | 1 | 1 | Stahelin-Maeglin, Dr. Max | male | 32.00 | 0 | 0 | 30.5000 | C | NaN |
| 641 | 1 | 1 | Sagesser, Mlle. Emma | female | 24.00 | 0 | 0 | 69.3000 | C | NaN |
| 647 | 1 | 1 | Simonius-Blumer, Col. Oberst Alfons | male | 56.00 | 0 | 0 | 35.5000 | C | NaN |
| 660 | 1 | 1 | Frauenthal, Dr. Henry William | male | 50.00 | 2 | 0 | 133.6500 | S | NaN |
| 694 | 0 | 1 | Weir, Col. John | male | 60.00 | 0 | 0 | 26.5500 | S | NaN |
| 709 | 1 | 3 | Moubarek, Master. Halim Gonios ("William George") | male | 25.00 | 1 | 1 | 15.2458 | C | NaN |
| 745 | 0 | 1 | Crosby, Capt. Edward Gifford | male | 70.00 | 1 | 1 | 71.0000 | S | NaN |
| 751 | 1 | 3 | Moor, Master. Meier | male | 6.00 | 0 | 1 | 12.4750 | S | NaN |
| 755 | 1 | 2 | Hamalainen, Master. Viljo | male | 0.67 | 1 | 1 | 14.5000 | S | NaN |
| 759 | 1 | 1 | Rothes, the Countess. of (Lucy Noel Martha Dye... | female | 33.00 | 0 | 0 | 86.5000 | S | NaN |
| 766 | 0 | 1 | Brewe, Dr. Arthur Jackson | male | 46.00 | 0 | 0 | 39.6000 | C | NaN |
| 787 | 0 | 3 | Rice, Master. George Hugh | male | 8.00 | 4 | 1 | 29.1250 | Q | NaN |
| 788 | 1 | 3 | Dean, Master. Bertram Vere | male | 1.00 | 1 | 2 | 20.5750 | S | NaN |
| 796 | 1 | 1 | Leader, Dr. Alice (Farnham) | female | 49.00 | 0 | 0 | 25.9292 | S | NaN |
| 802 | 1 | 1 | Carter, Master. William Thornton II | male | 11.00 | 1 | 2 | 120.0000 | S | NaN |
| 803 | 1 | 3 | Thomas, Master. Assad Alexander | male | 0.42 | 0 | 1 | 8.5167 | C | NaN |
| 819 | 0 | 3 | Skoog, Master. Karl Thorsten | male | 10.00 | 3 | 2 | 27.9000 | S | NaN |
| 822 | 0 | 1 | Reuchlin, Jonkheer. John George | male | 38.00 | 0 | 0 | 0.0000 | S | NaN |
| 824 | 0 | 3 | Panula, Master. Urho Abraham | male | 2.00 | 4 | 1 | 39.6875 | S | NaN |
| 827 | 1 | 2 | Mallet, Master. Andre | male | 1.00 | 0 | 2 | 37.0042 | C | NaN |
| 831 | 1 | 2 | Richards, Master. George Sibley | male | 0.83 | 1 | 1 | 18.7500 | S | NaN |
| 850 | 0 | 3 | Andersson, Master. Sigvard Harald Elias | male | 4.00 | 4 | 2 | 31.2750 | S | NaN |
| 869 | 1 | 3 | Johnson, Master. Harold Theodor | male | 4.00 | 1 | 1 | 11.1333 | S | NaN |
def name_title(name):
    """Return the honorific contained in a passenger name, or NaN.

    Names are expected in the form ``"Surname, Title. Given names"``. The
    title is taken as the last word between the first comma and the next
    period and is compared as a whole word. Substring tests are unreliable
    here: ``'Mr' in name`` also matches ``"Mrs"``, and short titles such as
    ``'Don'`` or ``'Col'`` can occur inside surnames or given names.

    Args:
        name: The full name string. A name without a comma is scanned from
            its start; the title must still be terminated by a period.

    Returns:
        The recognised title as a string (``'Ms'`` is reported as
        ``'Miss'``), or ``np.nan`` when the title is not recognised or the
        input is not a string.
    """
    # 'Leader' is deliberately absent: it is a surname (e.g. "Leader, Dr.
    # Alice"), not a title.
    # NOTE(review): 'Lady' and 'Sir' are included on the assumption that they
    # occur in this data set, as they do in the standard Titanic training
    # data -- confirm against the CSV.
    known_titles = (
        'Mr', 'Mrs', 'Miss', 'Rev', 'Master', 'Don', 'Dr', 'Mme', 'Major',
        'Mlle', 'Col', 'Countess', 'Jonkheer', 'Capt', 'Lady', 'Sir',
    )

    if not isinstance(name, str):
        return np.nan

    # Skip the surname when there is one.
    _, comma, remainder = name.partition(',')
    if not comma:
        remainder = name

    # Keep only the last word before the period, so that "the Countess"
    # reduces to "Countess".
    words = remainder.partition('.')[0].split()
    title = words[-1] if words else ''

    if title == 'Ms':
        return 'Miss'
    if title in known_titles:
        return title
    return np.nan
# Recompute the 'title' column with the extended name_title, then list any
# rows that still have no title (an empty result means every name matched).
dff['title'] = dff['Name'].apply(name_title)
still_untitled = dff['title'].isna()
dff[still_untitled]
| Survived | Pclass | Name | Sex | Age | SibSp | Parch | Fare | Embarked | title |
|---|
# Preview the first 10 rows to inspect the 'title' column.
dff.head(10)
| Survived | Pclass | Name | Sex | Age | SibSp | Parch | Fare | Embarked | title | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | 7.2500 | S | Mr |
| 1 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | 71.2833 | C | Mr |
| 2 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | 7.9250 | S | Miss |
| 3 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 53.1000 | S | Mr |
| 4 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 8.0500 | S | Mr |
| 5 | 0 | 3 | Moran, Mr. James | male | 25.0 | 0 | 0 | 8.4583 | Q | Mr |
| 6 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54.0 | 0 | 0 | 51.8625 | S | Mr |
| 7 | 0 | 3 | Palsson, Master. Gosta Leonard | male | 2.0 | 3 | 1 | 21.0750 | S | Master |
| 8 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27.0 | 0 | 2 | 11.1333 | S | Mr |
| 9 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14.0 | 1 | 0 | 30.0708 | C | Mr |
# Count plot of the 'title' column: one bar per distinct title, with the
# number of rows holding that title as its height.
plt.figure(figsize=(12,8))
# The style is set after the figure is created but before the plot is drawn;
# keep this order, since set_style only affects what is created afterwards.
sns.set_style("whitegrid")
sns.countplot(x='title', data=dff)
<AxesSubplot:xlabel='title', ylabel='count'>
# One-hot encode the categorical columns. Sex and Embarked drop their first
# category (it is implied when the remaining indicators are all 0); the title
# indicators keep every category.
# NOTE(review): the missing-value summary shows 2 missing Embarked values;
# get_dummies encodes those rows as all zeros, the same as the dropped first
# category -- confirm this is acceptable.
gender = pd.get_dummies(dff['Sex'], drop_first=True)
embark = pd.get_dummies(dff['Embarked'], drop_first=True)
title = pd.get_dummies(dff['title'])

# Remove the text columns in place (Name is dropped without being encoded),
# then append the indicator columns side by side.
raw_columns = ['Sex', 'Embarked', 'Name', 'title']
dff.drop(columns=raw_columns, inplace=True)
ds = pd.concat((dff, gender, embark, title), axis='columns')
ds.head(10)
| Survived | Pclass | Age | SibSp | Parch | Fare | male | Q | S | Capt | ... | Don | Dr | Jonkheer | Major | Master | Miss | Mlle | Mme | Mr | Rev | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 22.0 | 1 | 0 | 7.2500 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 1 | 1 | 1 | 38.0 | 1 | 0 | 71.2833 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2 | 1 | 3 | 26.0 | 0 | 0 | 7.9250 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 3 | 1 | 1 | 35.0 | 1 | 0 | 53.1000 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4 | 0 | 3 | 35.0 | 0 | 0 | 8.0500 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 5 | 0 | 3 | 25.0 | 0 | 0 | 8.4583 | 1 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 6 | 0 | 1 | 54.0 | 0 | 0 | 51.8625 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 7 | 0 | 3 | 2.0 | 3 | 1 | 21.0750 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 8 | 1 | 3 | 27.0 | 0 | 2 | 11.1333 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 9 | 1 | 2 | 14.0 | 1 | 0 | 30.0708 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
10 rows × 22 columns
# Remove the 'Jonkheer' indicator column in place before plotting.
# NOTE(review): the reason for dropping only this column is not visible
# here -- confirm with the author.
ds.drop(columns=['Jonkheer'], inplace=True)

# Heatmap of the pairwise correlations between the remaining columns.
plt.figure(figsize=(16, 12))
correlations = ds.corr()
sns.heatmap(correlations)
<AxesSubplot:>
# Write the cleaned, encoded frame to disk. With to_csv's defaults the row
# index is written as the first, unnamed column.
cleaned_csv_path = 'C:/Users/akbar/Documents/Coding Course/StackUp EDA Python/cleaned.csv'
ds.to_csv(cleaned_csv_path)